import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from IPython.display import display
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.figure_factory as ff
import seaborn as sns
# Directory holding the WiDS Datathon 2021 CSV files. The historical location
# stays the default, so existing runs are unaffected; set the WIDS_DATA_DIR
# environment variable to run the analysis elsewhere without editing the code.
DATA_DIR = os.environ.get(
    'WIDS_DATA_DIR',
    r'C:\Users\Einav bezalel\PycharmProjects\Datathon2021\input',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'TrainingWiDS2021.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'UnlabeledWiDS2021.csv'))
dictionary = pd.read_csv(os.path.join(DATA_DIR, 'DataDictionaryWiDS2021.csv'))

# Notebook-style previews: these bare expressions only render output when the
# code is executed cell by cell in an interactive session.
train.head()
test.head()
dictionary.head(10)
train.shape
test.shape

# Number of distinct values per column (nunique counts along columns by default).
# Author's observations from this output:
#   * 'Unnamed: 0' and 'encounter_id' are completely unique -> to be dropped
#   * 'readmission_status' has a single value (0)            -> to be dropped
train.nunique().sort_values(ascending=False)
# Same inspection for the test set.
test.nunique().sort_values(ascending=False)
# We suspect that hospital_id comes from different distributions in train and
# test, so check whether the two sets share any hospital at all.
# Overlap is symmetric (a train id found in test is, by definition, a test id
# found in train), so one membership test covers both directions; the former
# second branch could never fire.
if train['hospital_id'].isin(test['hospital_id']).any():
    # At least one hospital appears in both sets.
    print("train's hospital_id is in test")
else:
    # No hospital is shared between the sets; if so, the column must be dropped.
    print("hospital_id is completly different between the sets")
# Identifier-like, constant, or set-specific columns are removed from both
# data sets (in place, so the existing frame objects are kept).
uninformative_cols = ['Unnamed: 0', 'encounter_id', 'readmission_status', 'hospital_id']
for dataset in (train, test):
    dataset.drop(columns=uninformative_cols, inplace=True)

# Notebook-style size comparison between the dictionary and the train columns.
len(dictionary['Variable Name'])
len(train.columns)

# List the dictionary entries that have no matching column in train.
variables_of_dict = dictionary['Variable Name']
variables_of_train = train.columns
absent_from_train = ~variables_of_dict.isin(variables_of_train)
print(variables_of_dict[absent_from_train])

# Remove those dictionary rows by index label:
# 'encounter_id', hospital_id, icu_admit_type, readmission_status
dictionary.drop(index=[0, 1, 10, 15], inplace=True)

# Element-wise comparison of the remaining names with the train columns;
# a printed 0 means the two listings match position by position.
name_mismatch = dictionary['Variable Name'] != variables_of_train
print(name_mismatch.sum())
target_col = 'diabetes_mellitus'
train[target_col].value_counts()

# Share of each target value in percent; entries are looked up by label 0 / 1.
counts = train[target_col].value_counts(normalize=True) * 100
x = ['No Diabetes', 'Diabetes']
y = [int(counts[label]) for label in (0, 1)]  # int() truncates the percentage
color = ['rgb(0,0,225)', 'rgb(255,0,0)']

bar_trace = go.Bar(
    x=x,
    y=y,
    hovertext=[f'{pct:d}%' for pct in y],
    width=0.5,
    marker=dict(
        color=color,
        line_color='rgb(179,179,179)',
        line_width=2,
        opacity=0.6,
    ),
)
centered_title = {
    'text': "Percentage of Patients by Target [%]",
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}

fig = go.Figure()
fig.add_trace(bar_trace)
fig.update_layout(showlegend=False, title=centered_title)
fig.update_yaxes(title_text='Percentage [%]', title_standoff=30)
fig.show()

# Class imbalance expressed as "<label 0 share> : 1".
class_ratio = round(counts[0] / counts[1], 2)
print('Proportion: ', class_ratio, ":1")
def plot_heatmap(columns_list, null=True):
    """Show a lower-triangle correlation heatmap for a subset of ``train``.

    Only columns that are partially missing (some, but not all, values null)
    take part; completely filled and completely empty columns are removed
    before the correlation is computed.

    Parameters
    ----------
    columns_list : list-like of column labels
        Columns of the module-level ``train`` frame to analyse.
    null : bool, default True
        If True, correlate the missing-value indicators of the columns.
        If False, correlate the values of the numeric (and boolean) columns;
        non-numeric columns are left out.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and every column pair
        whose lower-triangle correlation exceeds 0.5 is printed.

    Notes
    -----
    Sets ``pio.templates.default`` to ``"none"`` as a side effect.
    """
    df = train[columns_list]

    # Keep partially-missing columns only. A boolean null indicator has a
    # non-zero variance exactly when it holds both True and False.
    null_flags = df.isnull()
    partially_missing = null_flags.any() & ~null_flags.all()
    df = df.loc[:, partially_missing.values]

    if null:
        corr_mat = df.isnull().corr()
        title = 'Correlation Matrix of Nulls'
    else:
        # Select numeric/boolean columns explicitly: pandas >= 2.0 no longer
        # discards non-numeric columns silently in DataFrame.corr().
        corr_mat = df.select_dtypes(include=[np.number, 'bool']).corr()
        title = 'Correlation Matrix of (Null is excluded)'

    # The matrix is symmetric, so blank out the upper triangle (diagonal
    # included). The builtin ``bool`` replaces ``np.bool``, which was removed
    # in NumPy 1.24.
    upper_triangle = np.triu(np.ones(corr_mat.shape, dtype=bool))
    corr1 = corr_mat.mask(upper_triangle)  # masked cells become NaN

    X = corr1.columns.values
    pio.templates.default = "none"  # disable plotly's default grey background

    fig = go.Figure()
    heat = go.Heatmap(z=corr1,
                      x=X,
                      y=X,
                      xgap=1, ygap=1,
                      colorscale='RdBu',
                      colorbar_thickness=20,
                      colorbar_ticklen=3,
                      zmid=0)
    layout = go.Layout(title_text=title, title_x=0.5,
                       width=750, height=600,
                       xaxis_showgrid=False,
                       yaxis_showgrid=False,
                       yaxis_autorange='reversed',
                       margin=dict(
                           l=180,
                           r=50,
                           b=180,
                           t=100))
    fig.add_trace(heat)
    fig.update_layout(layout)
    fig.update_yaxes(tickangle=-20)
    fig.update_xaxes(tickangle=-290)
    fig.show()

    # Long format: one row per remaining column pair, strongest first.
    sort_corr = corr1.stack().sort_values(ascending=False).reset_index()
    print(sort_corr[sort_corr[0] > 0.5])  # pairs with a correlation above 0.5
categories = dictionary['Category'].unique()
categories[:-1]

# One array of variable names per dictionary category (the last category is
# skipped). Keys are the category names with spaces replaced by underscores,
# e.g. a category called "labs blood gas" is stored under "labs_blood_gas".
category_columns = {
    category.replace(" ", "_"): dictionary.loc[
        dictionary['Category'] == category, 'Variable Name'
    ].values
    for category in categories[:-1]
}
# Publish each list as a module-level variable. This is done through
# globals() explicitly: writing into locals() is documented as unsupported
# and only happened to work because this code runs at module scope.
globals().update(category_columns)

# Notebook-style previews of the generated lists.
demographic
APACHE_covariate
vitals
labs
labs_blood_gas
APACHE_comorbidity

# Example for demographic: value correlations first, then null correlations.
plot_heatmap(demographic, False)
plot_heatmap(demographic)
# Example for vitals:
plot_heatmap(vitals, False)
plot_heatmap(vitals)
pio.templates.default = "plotly"  # return to the default template
# Weight and height of the training patients, split by gender. The four
# series are assigned explicitly instead of being injected through writes to
# locals(), so the names are visible to readers and static tools.
train_groupby = train.groupby("gender")
variables = ["weight", "height"]
male_weight = train_groupby["weight"].get_group('M')
female_weight = train_groupby["weight"].get_group('F')
male_height = train_groupby["height"].get_group('M')
female_height = train_groupby["height"].get_group('F')
colors = ['lightseagreen', 'indianred']
gender_labels = ["Male", "Female"]

# (male series, female series, x-axis title, x-axis range, figure title)
density_specs = [
    (male_weight, female_weight, 'weight [kg]', [35, 190], 'Weight Distribution'),
    (male_height, female_height, 'height [cm]', [135, 196], 'Height Distribution'),
]

for male_values, female_values, x_title, x_range, plot_title in density_specs:
    # Histogram and rug are switched off, leaving only the density curve
    # (a KDE by default; a normal fit is also available). Missing values are
    # removed first.
    fig = ff.create_distplot(
        [male_values.dropna(), female_values.dropna()],
        gender_labels,
        show_hist=False,
        bin_size=.25,
        colors=colors,
        show_rug=False,
    )
    fig.update_xaxes(title_text=x_title, range=x_range)
    fig.update_yaxes(title_text='probability density', title_standoff=30)
    fig.update_layout(title_text=plot_title)
    fig.show()
# Side-by-side box plots: weight in the left panel, height in the right one.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Weight [kg]', 'Height [cm]'))

box_panels = [
    (1, male_weight, female_weight),
    (2, male_height, female_height),
]
for panel_col, male_values, female_values in box_panels:
    # Legend entries come from the first panel only; the second panel would
    # merely repeat them.
    legend_opts = {} if panel_col == 1 else {'showlegend': False}
    by_gender = (
        (male_values, 'Male', 'lightseagreen'),
        (female_values, 'Female', 'indianred'),
    )
    for values, label, tint in by_gender:
        box = go.Box(
            y=values,
            name=label,
            marker_color=tint,
            boxmean='sd',  # also draw the mean and the standard deviation
            **legend_opts,
        )
        fig.add_trace(box, row=1, col=panel_col)
fig.show()
# Fill missing weight/height values with the gender-specific mode.
# The modes come from the training groups only and are reused for the test
# set. Series.mode() can return several values, so the first one is taken to
# obtain a scalar.
gender_modes = {
    'weight': {'F': female_weight.mode().iloc[0], 'M': male_weight.mode().iloc[0]},
    'height': {'F': female_height.mode().iloc[0], 'M': male_height.mode().iloc[0]},
}
for dataset in (train, test):
    for column, mode_by_gender in gender_modes.items():
        for gender, mode_value in mode_by_gender.items():
            # Each column is filled where *that column* is missing. Previously
            # weight was keyed on missing height, which overwrote known
            # weights and left missing weights unfilled.
            to_fill = dataset[column].isna() & (dataset['gender'] == gender)
            dataset.loc[to_fill, column] = mode_value
# Rows in which gender, height and weight are all missing cannot be imputed
# from the gender groups.
core_demographics = ['gender', 'height', 'weight']
train[train[core_demographics].isna().all(axis=1)]  # 18 rows
test[test[core_demographics].isna().all(axis=1)]  # 0 rows
# Drop them from train and renumber the remaining rows.
all_missing = train[core_demographics].isna().all(axis=1)
train = train[~all_missing].reset_index(drop=True)
# Age distribution, one colour per target class.
fig = px.histogram(train, x="age", histnorm='probability density', color=target_col)
fig.update_yaxes(title_text='probability density', title_standoff=30)
fig.show()

# Percentage of rows whose age is exactly 0 in each set.
test.loc[test['age'] == 0, 'age'].value_counts() / len(test) * 100  # no values
train.loc[train['age'] == 0, 'age'].value_counts() / len(train) * 100  # 0.023049%

# Only 0.023049% of the train rows have age 0, so those ages are treated as
# missing: the value is replaced by NaN while the rows themselves are kept.
for dataset in (train, test):
    dataset.loc[dataset['age'] == 0, 'age'] = np.nan
# Weight against BMI, coloured by the target, with marginal box plots.
# NOTE(review): "Diabese" in the labels below looks like a misspelling of
# "Diabetes"; the strings are left untouched here.
tar = train[target_col].replace({0: "No", 1: "Yes"})
fig = px.scatter(
    train,
    x="weight",
    y="bmi",
    color=tar,
    marginal_x="box",
    marginal_y="box",
    title="Weight Vs BMI",
)
weight_bmi_labels = dict(
    xaxis_title="Weight [kg]",
    yaxis_title="BMI",
    legend_title="Diabese",
)
fig.update_layout(**weight_bmi_labels)
fig.show()

# Example of how the two classes are distributed over a pair of features.
tar = train[target_col].replace({0: "No Diabese", 1: "Diabese"})
fig = px.scatter(train, x="age", y="bmi", color=tar, title="age Vs bmi")
age_bmi_labels = dict(
    title="Age Vs. BMI",  # replaces the title passed to px.scatter
    xaxis_title="Age [years]",
    yaxis_title="BMI",
    legend_title="Target",
)
fig.update_layout(**age_bmi_labels)
fig.show()
# The classes overlap heavily.
# Click on a legend entry to view each class separately.